Lab 2 - Linear Regression

1 - Install required libraries


In [ ]:
# Pandas Data Library
!pip install --upgrade pandas

In [ ]:
# Matplotlib Visualization
!pip install --upgrade matplotlib

In [ ]:
# Statsmodel LM library
!pip install --upgrade statsmodels

2 - Import notebook libraries


In [ ]:
# import notebook libraries
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.formula.api as smf

# this allows plots to appear directly in the notebook
%matplotlib inline

3 - Load Data Set


In [ ]:
# Fetch the Advertising CSV into a DataFrame; the first CSV column is used
# as the row index.
# NOTE(review): this host may no longer be reachable — confirm the URL still resolves.
data = pd.read_csv(
    'http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv',
    index_col=0,
)

# Preview the first five rows
data.head(n=5)

In [ ]:
# Shape of the data frame, shown as a (rows, columns) tuple
data.shape

4 - Visualize data using scatterplot


In [ ]:
# Visualize the relationship between each advertising channel and sales,
# one scatterplot panel per channel.
# The figure size is set when the figure is created, and the panels share a
# y-axis because every panel plots the same response (sales).
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(16, 8))

# One loop instead of three copy-pasted calls; each panel gets a title so the
# figure can be read on its own.
for ax, feature in zip(axs, ['TV', 'radio', 'newspaper']):
    data.plot(kind='scatter', x=feature, y='sales', ax=ax)
    ax.set_title(f'sales vs. {feature}')

5 - Create Linear Regression


In [ ]:
# Fit an ordinary least squares model of sales on TV in one line.
# `smf` (statsmodels.formula.api) is imported in the notebook's import cell
# so that all dependencies are declared in one place.
lm = smf.ols(formula='sales ~ TV', data=data).fit()

# Show the fitted coefficients
lm.params

6 - Prediction


In [ ]:
# The model predicts from a DataFrame, so wrap the single TV value
# in a one-row frame whose column name matches the model's predictor.
X_new = pd.DataFrame([{'TV': 50}])

# Show the frame that will be passed to the model
X_new.head()

In [ ]:
# Use the fitted model `lm` to predict for the new TV value(s) held in X_new
lm.predict(X_new)

In [ ]:
# Manually calculate the prediction: intercept + slope * TV.
# The coefficients are read from the fitted model instead of being retyped as
# rounded literals, so this stays in sync with lm.params if the fit changes.
lm.params['Intercept'] + lm.params['TV'] * 50

7 - Analysis


In [ ]:
# Build a two-row frame holding the smallest and largest observed TV values
tv = data['TV']
X_new = pd.DataFrame({'TV': [tv.min(), tv.max()]})

# Show the two endpoint rows
X_new.head()

In [ ]:
# Make predictions for the TV values in X_new and store them in `preds`
preds = lm.predict(X_new)
# Display the stored predictions
preds

In [ ]:
# First, plot the observed data and keep the returned Axes so the fitted line
# is drawn on the same panel explicitly, not via pyplot's "current axes".
ax = data.plot(kind='scatter', x='TV', y='sales')

# Then, plot the least squares line through the predictions for the TV values
# in X_new. The TV column is passed as x (not the whole DataFrame), and the
# trailing semicolon suppresses the list-of-lines repr in the cell output.
ax.plot(X_new['TV'], preds, c='red', linewidth=2);